In [63]:

    
import pandas as pd

def preview(df):
    print("Dimensions: {0} rows x {1} columns".format(df.shape[0], df.shape[1]))
    return df.head()

jeopardy = pd.read_csv("jeopardy.csv")
preview(jeopardy)









    



Dimensions: 216930 rows x 7 columns






    Out[63]:






  
    
      
      Show Number
      Air Date
      Round
      Category
      Value
      Question
      Answer
    
  
  
    
      0
      4680
      2004-12-31
      Jeopardy!
      HISTORY
      $200
      For the last 8 years of his life, Galileo was ...
      Copernicus
    
    
      1
      4680
      2004-12-31
      Jeopardy!
      ESPN's TOP 10 ALL-TIME ATHLETES
      $200
      No. 2: 1912 Olympian; football star at Carlisl...
      Jim Thorpe
    
    
      2
      4680
      2004-12-31
      Jeopardy!
      EVERYBODY TALKS ABOUT IT...
      $200
      The city of Yuma in this state has a record av...
      Arizona
    
    
      3
      4680
      2004-12-31
      Jeopardy!
      THE COMPANY LINE
      $200
      In 1963, live on "The Art Linkletter Show", th...
      McDonald's
    
    
      4
      4680
      2004-12-31
      Jeopardy!
      EPITAPHS & TRIBUTES
      $200
      Signer of the Dec. of Indep., framer of the Co...
      John Adams



In [64]:

    
print(jeopardy.columns)

# Rename columns
jeopardy.rename(columns=lambda name: name.lstrip(), inplace=True)









    



Index([u'Show Number', u' Air Date', u' Round', u' Category', u' Value',
       u' Question', u' Answer'],
      dtype='object')



In [65]:

    
print(jeopardy.columns)









    



Index([u'Show Number', u'Air Date', u'Round', u'Category', u'Value',
       u'Question', u'Answer'],
      dtype='object')

Normalizing text



In [66]:

    
import string

def norm_words(words):
    words = words.lower().translate(None, string.punctuation)
    return words

jeopardy["clean_question"] = jeopardy["Question"].apply(norm_words)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(norm_words)

jeopardy.head()









    Out[66]:






  
    
      
      Show Number
      Air Date
      Round
      Category
      Value
      Question
      Answer
      clean_question
      clean_answer
    
  
  
    
      0
      4680
      2004-12-31
      Jeopardy!
      HISTORY
      $200
      For the last 8 years of his life, Galileo was ...
      Copernicus
      for the last 8 years of his life galileo was u...
      copernicus
    
    
      1
      4680
      2004-12-31
      Jeopardy!
      ESPN's TOP 10 ALL-TIME ATHLETES
      $200
      No. 2: 1912 Olympian; football star at Carlisl...
      Jim Thorpe
      no 2 1912 olympian football star at carlisle i...
      jim thorpe
    
    
      2
      4680
      2004-12-31
      Jeopardy!
      EVERYBODY TALKS ABOUT IT...
      $200
      The city of Yuma in this state has a record av...
      Arizona
      the city of yuma in this state has a record av...
      arizona
    
    
      3
      4680
      2004-12-31
      Jeopardy!
      THE COMPANY LINE
      $200
      In 1963, live on "The Art Linkletter Show", th...
      McDonald's
      in 1963 live on the art linkletter show this c...
      mcdonalds
    
    
      4
      4680
      2004-12-31
      Jeopardy!
      EPITAPHS & TRIBUTES
      $200
      Signer of the Dec. of Indep., framer of the Co...
      John Adams
      signer of the dec of indep framer of the const...
      john adams

Normalizing columns



In [67]:

    
def norm_value(value):
    try:
        value = int(value.translate(None, string.punctuation))
    except:
        value = 0
    return value

jeopardy["clean_value"] = jeopardy["Value"].apply(norm_value)
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])

print(jeopardy.dtypes)
jeopardy.head()









    



Show Number                int64
Air Date          datetime64[ns]
Round                     object
Category                  object
Value                     object
Question                  object
Answer                    object
clean_question            object
clean_answer              object
clean_value                int64
dtype: object






    Out[67]:






  
    
      
      Show Number
      Air Date
      Round
      Category
      Value
      Question
      Answer
      clean_question
      clean_answer
      clean_value
    
  
  
    
      0
      4680
      2004-12-31
      Jeopardy!
      HISTORY
      $200
      For the last 8 years of his life, Galileo was ...
      Copernicus
      for the last 8 years of his life galileo was u...
      copernicus
      200
    
    
      1
      4680
      2004-12-31
      Jeopardy!
      ESPN's TOP 10 ALL-TIME ATHLETES
      $200
      No. 2: 1912 Olympian; football star at Carlisl...
      Jim Thorpe
      no 2 1912 olympian football star at carlisle i...
      jim thorpe
      200
    
    
      2
      4680
      2004-12-31
      Jeopardy!
      EVERYBODY TALKS ABOUT IT...
      $200
      The city of Yuma in this state has a record av...
      Arizona
      the city of yuma in this state has a record av...
      arizona
      200
    
    
      3
      4680
      2004-12-31
      Jeopardy!
      THE COMPANY LINE
      $200
      In 1963, live on "The Art Linkletter Show", th...
      McDonald's
      in 1963 live on the art linkletter show this c...
      mcdonalds
      200
    
    
      4
      4680
      2004-12-31
      Jeopardy!
      EPITAPHS & TRIBUTES
      $200
      Signer of the Dec. of Indep., framer of the Co...
      John Adams
      signer of the dec of indep framer of the const...
      john adams
      200

Answers in questions



In [73]:

    
def ans_in_q(row):
    match_count = 0
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    
    try:
        split_answer.remove("the")
    except:
        pass
    
    if len(split_answer) == 0:
        return 0
    else:
        for word in split_answer:
            if word in split_question:
                match_count += 1
        return match_count / len(split_answer)

jeopardy["answer_in_question"] = jeopardy.apply(ans_in_q, axis=1)
print(jeopardy["answer_in_question"].mean())
jeopardy[jeopardy["answer_in_question"] > 0].head()









    



0.00643064583045






    Out[73]:






  
    
      
      Show Number
      Air Date
      Round
      Category
      Value
      Question
      Answer
      clean_question
      clean_answer
      clean_value
      answer_in_question
    
  
  
    
      266
      4931
      2006-02-06
      Double Jeopardy!
      NOT A CURRENT NATIONAL CAPITAL
      $400
      Ljubljana, Bratislava, Barcelona
      Barcelona
      ljubljana bratislava barcelona
      barcelona
      400
      1
    
    
      272
      4931
      2006-02-06
      Double Jeopardy!
      NOT A CURRENT NATIONAL CAPITAL
      $800
      Istanbul, Ottawa, Amman
      Istanbul
      istanbul ottawa amman
      istanbul
      800
      1
    
    
      278
      4931
      2006-02-06
      Double Jeopardy!
      NOT A CURRENT NATIONAL CAPITAL
      $1200
      Sofia, Sarajevo, Saigon
      Saigon
      sofia sarajevo saigon
      saigon
      1200
      1
    
    
      284
      4931
      2006-02-06
      Double Jeopardy!
      NOT A CURRENT NATIONAL CAPITAL
      $1600
      Bucharest, Bonn, Bern
      Bonn
      bucharest bonn bern
      bonn
      1600
      1
    
    
      290
      4931
      2006-02-06
      Double Jeopardy!
      NOT A CURRENT NATIONAL CAPITAL
      $2000
      Belize City, Guatemala City, Panama City
      Belize City
      belize city guatemala city panama city
      belize city
      2000
      1



In [85]:

    
jeopardy[(jeopardy["answer_in_question"] > 0) & (jeopardy["clean_question"].apply(string.split).apply(len) > 6)].head()









    Out[85]:






  
    
      
      Show Number
      Air Date
      Round
      Category
      Value
      Question
      Answer
      clean_question
      clean_answer
      clean_value
      answer_in_question
    
  
  
    
      1137
      1279
      1990-03-08
      Jeopardy!
      PEANUTS
      $200
      Of a 25th, 30th or 40th anniversary, what "Pea...
      40th Anniversary
      of a 25th 30th or 40th anniversary what peanut...
      40th anniversary
      200
      1
    
    
      1840
      3113
      1998-02-25
      Double Jeopardy!
      TAKE A GUESS
      $600
      Of a pogo stick injury, a dense winter fog or ...
      a dense winter fog
      of a pogo stick injury a dense winter fog or t...
      a dense winter fog
      600
      1
    
    
      2347
      4595
      2004-07-23
      Jeopardy!
      BIRD HUNTING
      $800
      The third rail in a subway system is the one w...
      a rail
      the third rail in a subway system is the one w...
      a rail
      800
      1
    
    
      2572
      4220
      2002-12-27
      Jeopardy!
      THE PLANET URANUS
      $400
      Of 84, 184 or 284, the length in years of one ...
      84
      of 84 184 or 284 the length in years of one or...
      84
      400
      1
    
    
      4163
      4213
      2002-12-18
      Double Jeopardy!
      MUD
      $400
      Of an artist, a fish, or a wasp, it's what a m...
      wasp
      of an artist a fish or a wasp its what a mud d...
      wasp
      400
      1

Only 0.6% of the answers appear in the questions itself. Out of this 0.6%, a sample of the questions shows that they are all multiple choice questions, which concludes that it is very unlikely that the answer will be in the question itself.

Recycled questions



In [102]:

    
jeopardy = jeopardy.sort_values(by="Air Date")

question_overlap = []
terms_used = set()

for index, row in jeopardy.iterrows():
    match_count = 0
    split_question = row["clean_question"].split(" ")
    for word in split_question:
        if len(word) < 6:
            split_question.remove(word)
    for word in split_question:
        if word in terms_used:
            match_count += 1
        terms_used.add(word)
    if len(split_question) > 0:
        match_count /= float(len(split_question))
    question_overlap.append(match_count)

jeopardy["question_overlap"] = question_overlap
print(jeopardy["question_overlap"].mean())









    



0.928425630164



In [105]:

    
jeopardy.tail()









    Out[105]:






  
    
      
      Show Number
      Air Date
      Round
      Category
      Value
      Question
      Answer
      clean_question
      clean_answer
      clean_value
      answer_in_question
      question_overlap
    
  
  
    
      105940
      6300
      2012-01-27
      Jeopardy!
      THE TRUTH LIES THEREIN
      $400
      Old school GPS on a shopping mall map:  "You A...
      here
      old school gps on a shopping mall map  you are...
      here
      400
      0
      1.000000
    
    
      105933
      6300
      2012-01-27
      Jeopardy!
      LESSER-KNOWN SCIENTISTS
      $200
      In 1779 Dutch scientist Jan Ingenhousz publish...
      photosynthesis
      in 1779 dutch scientist jan ingenhousz publish...
      photosynthesis
      200
      0
      0.888889
    
    
      105935
      6300
      2012-01-27
      Jeopardy!
      VISITING THE CITY
      $400
      First the Royal Ontario Museum, then for lunch...
      Toronto
      first the royal ontario museum then for lunch ...
      toronto
      400
      0
      1.000000
    
    
      105951
      6300
      2012-01-27
      Jeopardy!
      LESSER-KNOWN SCIENTISTS
      $800
      Joseph Lagrange insisted on 10 as the basic un...
      the metric system
      joseph lagrange insisted on 10 as the basic un...
      the metric system
      800
      0
      0.833333
    
    
      105930
      6300
      2012-01-27
      Jeopardy!
      PANTS
      $200
      A synonym for freight, or pants with large bel...
      cargo pants
      a synonym for freight or pants with large bell...
      cargo pants
      200
      0
      0.909091

Low value vs high value questions



In [106]:

    
def value(row):
    if row["clean_value"] > 800:
        value = 1
    else:
        value = 0
    return value

jeopardy["high_value"] = jeopardy.apply(value, axis=1)
jeopardy.head()









    Out[106]:






  
    
      
      Show Number
      Air Date
      Round
      Category
      Value
      Question
      Answer
      clean_question
      clean_answer
      clean_value
      answer_in_question
      question_overlap
      high_value
    
  
  
    
      84523
      1
      1984-09-10
      Jeopardy!
      LAKES & RIVERS
      $100
      River mentioned most often in the Bible
      the Jordan
      river mentioned most often in the bible
      the jordan
      100
      0
      0.000000
      0
    
    
      84544
      1
      1984-09-10
      Jeopardy!
      ANIMALS
      $500
      If this species of hybrid's parents were rever...
      a mule
      if this species of hybrids parents were revers...
      a mule
      500
      0
      0.000000
      0
    
    
      84543
      1
      1984-09-10
      Jeopardy!
      LAKES & RIVERS
      $500
      World's largest lake, nearly 5 times as big as...
      the Caspian Sea
      worlds largest lake nearly 5 times as big as s...
      the caspian sea
      500
      0
      0.000000
      0
    
    
      84542
      1
      1984-09-10
      Jeopardy!
      ACTORS & ROLES
      $400
      The blonde preferred in the film "Gentlemen Pr...
      Marilyn Monroe
      the blonde preferred in the film gentlemen pre...
      marilyn monroe
      400
      0
      0.166667
      0
    
    
      84553
      1
      1984-09-10
      Double Jeopardy!
      NATIONAL LANDMARKS
      $400
      When he was home, George Washington slept here
      Mount Vernon
      when he was home george washington slept here
      mount vernon
      400
      0
      0.000000
      0



In [ ]:

	Show Number	Air Date	Round	Category	Value	Question	Answer
0	4680	2004-12-31	Jeopardy!	HISTORY	$200	For the last 8 years of his life, Galileo was ...	Copernicus
1	4680	2004-12-31	Jeopardy!	ESPN's TOP 10 ALL-TIME ATHLETES	$200	No. 2: 1912 Olympian; football star at Carlisl...	Jim Thorpe
2	4680	2004-12-31	Jeopardy!	EVERYBODY TALKS ABOUT IT...	$200	The city of Yuma in this state has a record av...	Arizona
3	4680	2004-12-31	Jeopardy!	THE COMPANY LINE	$200	In 1963, live on "The Art Linkletter Show", th...	McDonald's
4	4680	2004-12-31	Jeopardy!	EPITAPHS & TRIBUTES	$200	Signer of the Dec. of Indep., framer of the Co...	John Adams

	Show Number	Air Date	Round	Category	Value	Question	Answer	clean_question	clean_answer	clean_value	answer_in_question
266	4931	2006-02-06	Double Jeopardy!	NOT A CURRENT NATIONAL CAPITAL	$400	Ljubljana, Bratislava, Barcelona	Barcelona	ljubljana bratislava barcelona	barcelona	400	1
272	4931	2006-02-06	Double Jeopardy!	NOT A CURRENT NATIONAL CAPITAL	$800	Istanbul, Ottawa, Amman	Istanbul	istanbul ottawa amman	istanbul	800	1
278	4931	2006-02-06	Double Jeopardy!	NOT A CURRENT NATIONAL CAPITAL	$1200	Sofia, Sarajevo, Saigon	Saigon	sofia sarajevo saigon	saigon	1200	1
284	4931	2006-02-06	Double Jeopardy!	NOT A CURRENT NATIONAL CAPITAL	$1600	Bucharest, Bonn, Bern	Bonn	bucharest bonn bern	bonn	1600	1
290	4931	2006-02-06	Double Jeopardy!	NOT A CURRENT NATIONAL CAPITAL	$2000	Belize City, Guatemala City, Panama City	Belize City	belize city guatemala city panama city	belize city	2000	1

	Show Number	Air Date	Round	Category	Value	Question	Answer	clean_question	clean_answer	clean_value	answer_in_question
1137	1279	1990-03-08	Jeopardy!	PEANUTS	$200	Of a 25th, 30th or 40th anniversary, what "Pea...	40th Anniversary	of a 25th 30th or 40th anniversary what peanut...	40th anniversary	200	1
1840	3113	1998-02-25	Double Jeopardy!	TAKE A GUESS	$600	Of a pogo stick injury, a dense winter fog or ...	a dense winter fog	of a pogo stick injury a dense winter fog or t...	a dense winter fog	600	1
2347	4595	2004-07-23	Jeopardy!	BIRD HUNTING	$800	The third rail in a subway system is the one w...	a rail	the third rail in a subway system is the one w...	a rail	800	1
2572	4220	2002-12-27	Jeopardy!	THE PLANET URANUS	$400	Of 84, 184 or 284, the length in years of one ...	84	of 84 184 or 284 the length in years of one or...	84	400	1
4163	4213	2002-12-18	Double Jeopardy!	MUD	$400	Of an artist, a fish, or a wasp, it's what a m...	wasp	of an artist a fish or a wasp its what a mud d...	wasp	400	1

	Show Number	Air Date	Round	Category	Value	Question	Answer	clean_question	clean_answer	clean_value	question_overlap
105940	6300	2012-01-27	Jeopardy!	THE TRUTH LIES THEREIN	$400	Old school GPS on a shopping mall map: "You A...	here	old school gps on a shopping mall map you are...	here	400	1.000000
105933	6300	2012-01-27	Jeopardy!	LESSER-KNOWN SCIENTISTS	$200	In 1779 Dutch scientist Jan Ingenhousz publish...	photosynthesis	in 1779 dutch scientist jan ingenhousz publish...	photosynthesis	200	0.888889
105935	6300	2012-01-27	Jeopardy!	VISITING THE CITY	$400	First the Royal Ontario Museum, then for lunch...	Toronto	first the royal ontario museum then for lunch ...	toronto	400	1.000000
105951	6300	2012-01-27	Jeopardy!	LESSER-KNOWN SCIENTISTS	$800	Joseph Lagrange insisted on 10 as the basic un...	the metric system	joseph lagrange insisted on 10 as the basic un...	the metric system	800	0.833333
105930	6300	2012-01-27	Jeopardy!	PANTS	$200	A synonym for freight, or pants with large bel...	cargo pants	a synonym for freight or pants with large bell...	cargo pants	200	0.909091

	Show Number	Air Date	Round	Category	Value	Question	Answer	clean_question	clean_answer	clean_value	question_overlap
84523	1	1984-09-10	Jeopardy!	LAKES & RIVERS	$100	River mentioned most often in the Bible	the Jordan	river mentioned most often in the bible	the jordan	100	0.000000
84544	1	1984-09-10	Jeopardy!	ANIMALS	$500	If this species of hybrid's parents were rever...	a mule	if this species of hybrids parents were revers...	a mule	500	0.000000
84543	1	1984-09-10	Jeopardy!	LAKES & RIVERS	$500	World's largest lake, nearly 5 times as big as...	the Caspian Sea	worlds largest lake nearly 5 times as big as s...	the caspian sea	500	0.000000
84542	1	1984-09-10	Jeopardy!	ACTORS & ROLES	$400	The blonde preferred in the film "Gentlemen Pr...	Marilyn Monroe	the blonde preferred in the film gentlemen pre...	marilyn monroe	400	0.166667
84553	1	1984-09-10	Double Jeopardy!	NATIONAL LANDMARKS	$400	When he was home, George Washington slept here	Mount Vernon	when he was home george washington slept here	mount vernon	400	0.000000